Data Science Project - Spotify
by Bruno Miguel Pereira Gomes, Lorenzo Adam Piazza
In [2]:
# Load datasets
import json
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as ps
def _parse_iso_date(column):
    """Parse a column of '%Y-%m-%d' strings into datetimes; values that fail to parse become NaT."""
    return pd.to_datetime(column, format = '%Y-%m-%d', errors = 'coerce')

# Songs table, indexed by its 'id' column.
df_songs = pd.read_csv('songs_extended.csv', index_col = 'id')
df_songs['release_date'] = _parse_iso_date(df_songs['release_date'])

# Genre lookup table, indexed by its 'genre' column.
df_genres = pd.read_csv('genres.csv', index_col = 'genre')

# Artists table, indexed by its 'id' column.
df_artists = pd.read_csv('artists.csv', index_col = 'id')
df_artists['first_song'] = _parse_iso_date(df_artists['first_song'])

# Streaming history export: read the JSON as a Series, then flatten the
# records stored under its "songs" entry into one row per record.
unnormalized_history = pd.read_json(
    'my_spotify_data/Spotify Extended Streaming History/Streaming_History_Audio_2018-2023.json',
    typ = 'series',
)
df_allHistory = pd.json_normalize(unnormalized_history["songs"])
In [2]:
# Schema check for the songs table: dtypes, non-null counts and memory use.
# release_date is the only column with missing values (NaT) after the coerced date parsing.
df_songs.info()
<class 'pandas.core.frame.DataFrame'> Index: 1204025 entries, 7lmeHLHBe4nmXzuXc0HDjk to 3GgQmOxxLyRoAb4j86zOBX Data columns (total 26 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 1204025 non-null object 1 album 1204025 non-null object 2 album_id 1204025 non-null object 3 artists 1204025 non-null object 4 artist_ids 1204025 non-null object 5 track_number 1204025 non-null int64 6 disc_number 1204025 non-null int64 7 explicit 1204025 non-null bool 8 danceability 1204025 non-null float64 9 energy 1204025 non-null float64 10 key 1204025 non-null int64 11 loudness 1204025 non-null float64 12 mode 1204025 non-null int64 13 speechiness 1204025 non-null float64 14 acousticness 1204025 non-null float64 15 instrumentalness 1204025 non-null float64 16 liveness 1204025 non-null float64 17 valence 1204025 non-null float64 18 tempo 1204025 non-null float64 19 duration_ms 1204025 non-null int64 20 time_signature 1204025 non-null float64 21 year 1204025 non-null int64 22 release_date 1204015 non-null datetime64[ns] 23 genres 1204025 non-null object 24 categories 1204025 non-null object 25 artist_popularity 1204025 non-null int64 dtypes: bool(1), datetime64[ns](1), float64(10), int64(7), object(7) memory usage: 240.0+ MB
In [3]:
# Preview the first five rows of the songs table (indexed by 'id').
df_songs.head()
Out[3]:
| name | album | album_id | artists | artist_ids | track_number | disc_number | explicit | danceability | energy | ... | liveness | valence | tempo | duration_ms | time_signature | year | release_date | genres | categories | artist_popularity | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| id | |||||||||||||||||||||
| 7lmeHLHBe4nmXzuXc0HDjk | Testify | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 1 | 1 | False | 0.470 | 0.978 | ... | 0.3560 | 0.503 | 117.906 | 210133 | 4.0 | 1999 | 1999-11-02 | ['alternative metal', 'alternative rock', 'con... | ['hiphop', 'metal', 'other', 'rap', 'rock'] | 68 |
| 1wsRitfRRtWyEapl0q22o8 | Guerrilla Radio | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 2 | 1 | True | 0.599 | 0.957 | ... | 0.1550 | 0.489 | 103.680 | 206200 | 4.0 | 1999 | 1999-11-02 | ['alternative metal', 'alternative rock', 'con... | ['hiphop', 'metal', 'other', 'rap', 'rock'] | 68 |
| 1hR0fIFK2qRG3f3RF70pb7 | Calm Like a Bomb | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 3 | 1 | False | 0.315 | 0.970 | ... | 0.1220 | 0.370 | 149.749 | 298893 | 4.0 | 1999 | 1999-11-02 | ['alternative metal', 'alternative rock', 'con... | ['hiphop', 'metal', 'other', 'rap', 'rock'] | 68 |
| 2lbASgTSoDO7MTuLAXlTW0 | Mic Check | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 4 | 1 | True | 0.440 | 0.967 | ... | 0.1210 | 0.574 | 96.752 | 213640 | 4.0 | 1999 | 1999-11-02 | ['alternative metal', 'alternative rock', 'con... | ['hiphop', 'metal', 'other', 'rap', 'rock'] | 68 |
| 1MQTmpYOZ6fcMQc56Hdo7T | Sleep Now In the Fire | The Battle Of Los Angeles | 2eia0myWFgoHuttJytCxgX | ['Rage Against The Machine'] | ['2d0hyoQ5ynDBnkvAbJKORj'] | 5 | 1 | False | 0.426 | 0.929 | ... | 0.0789 | 0.539 | 127.059 | 205600 | 4.0 | 1999 | 1999-11-02 | ['alternative metal', 'alternative rock', 'con... | ['hiphop', 'metal', 'other', 'rap', 'rock'] | 68 |
5 rows × 26 columns
In [4]:
# Schema check for the genre lookup: a single 'category' column indexed by genre name.
df_genres.info()
<class 'pandas.core.frame.DataFrame'> Index: 5265 entries, 2-step to zydeco Data columns (total 1 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 category 5265 non-null object dtypes: object(1) memory usage: 82.3+ KB
In [5]:
# Preview the first five genre -> category rows.
df_genres.head()
Out[5]:
| category | |
|---|---|
| genre | |
| 2-step | other |
| 21st century classical | classic |
| 432hz | other |
| 48g | other |
| 5th wave emo | other |
In [6]:
# Schema check for the artists table: dtypes and non-null counts.
# 'name' and 'first_song' are the columns with missing values.
df_artists.info()
<class 'pandas.core.frame.DataFrame'> Index: 140064 entries, 7F4HcalxCMC4DctguvnoFY to 1tsNLehJWv67iMipy0WwQR Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 name 140061 non-null object 1 genres 140064 non-null object 2 category 140064 non-null object 3 followers 140064 non-null int64 4 popularity 140064 non-null int64 5 first_song 139540 non-null datetime64[ns] dtypes: datetime64[ns](1), int64(2), object(3) memory usage: 7.5+ MB
In [7]:
# Preview the first five rows of the artists table (indexed by 'id').
df_artists.head()
Out[7]:
| name | genres | category | followers | popularity | first_song | |
|---|---|---|---|---|---|---|
| id | ||||||
| 7F4HcalxCMC4DctguvnoFY | Ace Wilder | ['melodipop', 'swedish pop'] | ['pop'] | 29030 | 23 | 2014-02-23 |
| 21QpKYI2IXZRv21rLrbNZy | Rolf Gjelsten | [] | [] | 2 | 0 | 2015-02-03 |
| 1gQ1kamk2cHpy59Zyd1TPp | Sophia McKenna | [] | [] | 9 | 15 | 1989-11-01 |
| 5XDrmlSI7dXpJkiPb3Oqd4 | Underdog | ['hardcore', 'new jersey hardcore', 'nyhc'] | ['other'] | 10586 | 14 | 1996-01-01 |
| 58wjOQFamj752bO2Bbjjug | Veer | ['dubstep', 'indian electronic', 'riddim dubst... | ['electronic', 'other'] | 6198 | 10 | 2020-01-24 |
In [8]:
# Distribution of song durations (0 - 10 minutes)

# Derived duration columns: whole seconds (milliseconds floored), minutes as a
# float, and a flag for tracks strictly longer than 10 minutes.
df_songs['duration_sec'] = df_songs['duration_ms'].floordiv(1000)
df_songs['duration_min'] = df_songs['duration_sec'].div(60)
df_songs['is_over_10_mins'] = df_songs['duration_min'].gt(10)

# Headline counts; the "below" bucket is everything that is not flagged as over
# 10 minutes, so it also holds tracks of exactly 10 minutes.
num_of_songs = df_songs.shape[0]
num_of_songs_above_10_minutes = df_songs["is_over_10_mins"].sum()
num_of_songs_below_10_minutes = num_of_songs - num_of_songs_above_10_minutes

print(f'Number of songs below 10 minutes: { num_of_songs_below_10_minutes } ( { num_of_songs_below_10_minutes / num_of_songs * 100 }% )')
print(f'Number of songs over 10 minutes: { num_of_songs_above_10_minutes } ( { num_of_songs_above_10_minutes / num_of_songs * 100 }% )')
print(f'Average song duration: {df_songs["duration_min"].mean():.0f} minutes')

# Histogram of durations with the x-axis limited to the 0-10 minute window;
# the figure is the cell's last expression, so the notebook renders it.
(
    px.histogram(data_frame = df_songs, x = 'duration_min', range_x = [0, 10], nbins = 500)
    .update_layout(
        title = 'Distribution of song durations (0 - 10 minutes)',
        xaxis_title = 'Song duration (Minutes)',
        yaxis_title = 'Number of Songs',
    )
)
Number of songs below 10 minutes: 1176239 ( 97.69224060962189% ) Number of songs over 10 minutes: 27786 ( 2.3077593903781066% ) Average song duration: 4 minutes